package cn.chiship.framework.business.crawler;

import cn.chiship.framework.common.util.TrustAllCertManager;
import cn.chiship.sdk.core.base.BaseResult;
import cn.chiship.sdk.core.id.SnowflakeIdUtil;
import cn.chiship.sdk.core.util.*;
import cn.chiship.sdk.core.util.http.HttpUtil;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Crawler for CCTV news (http://news.cctv.com/china/).
 * <p>
 * For every configured category it pages through the CCTV JSONP list interface, keeps the
 * articles whose {@code focus_date} falls on yesterday, downloads each article page and inserts
 * one row per article into the {@code content_article} table.
 */
public class CctvNewCrawler {

	/** Prefix of the paged JSONP list interface; {@code <type>_<page>.jsonp} is appended. */
	private static final String LIST_URL_PREFIX = "https://news.cctv.com/2019/07/gaiban/cmsdatainterface/page/";

	// NOTE(review): connection settings and credentials are hard-coded in source;
	// consider moving them to external configuration.
	private static final String JDBC_DRIVER = "com.mysql.cj.jdbc.Driver";

	private static final String JDBC_URL = "jdbc:mysql://localhost:3306/chiship_common_simple?useUnicode=true&characterEncoding=UTF8&useSSL=false&serverTimezone=Asia/Shanghai";

	private static final String JDBC_USER = "root";

	private static final String JDBC_PASSWORD = "123456";

	/** Parameterized insert; the placeholder order must match the parameter list built in {@code saveArticle}. */
	private static final String INSERT_ARTICLE_SQL = "INSERT INTO `content_article`(`id`, `gmt_created`, `gmt_modified`, `category_id`, `image1`, `title`, summary,"
			+ "`keywords`, `status`, `publish_date`, `source`, `source_url`, `author`,meta_description,content) "
			+ "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";

	/** JDBC helper used for the inserts; created by {@link #crawlerList(List)} or lazily on first insert. */
	JdbcUtil jdbcUtil = null;

	/**
	 * Crawls every category described in {@code infos}, starting each one at page 1.
	 * @param infos category descriptors; each is read for the keys {@code name} (used in the
	 * progress message), {@code type} (list interface name) and {@code id} (stored as
	 * {@code category_id})
	 */
	public void crawlerList(List<JSONObject> infos) {
		jdbcUtil = newJdbcUtil();
		for (JSONObject info : infos) {
			System.out.println("正在爬取【" + info.getString("name") + "】新闻");
			crawlerList(info.getString("type"), info.getString("id"), 1);
		}
	}

	/**
	 * Fetches one page of the list interface for {@code type} and stores the articles published
	 * yesterday. When {@code page} is 1 the remaining pages are fetched as well; the page count is
	 * derived from the reported {@code total} and the size of the first page.
	 * @param type list interface name, also the JSONP callback name (e.g. {@code china})
	 * @param categoryId value written to the {@code category_id} column
	 * @param page 1-based page number
	 */
	public void crawlerList(String type, String categoryId, Integer page) {
		System.out.println("正在爬取【" + type + "\t第" + page + "页】数据");
		BaseResult baseResult = HttpUtil.getInstance().doGet(LIST_URL_PREFIX + type + "_" + page + ".jsonp");
		Object payload = baseResult == null ? null : baseResult.getData();
		if (payload == null) {
			System.out.println("【" + type + "\t第" + page + "页】无响应数据，跳过");
			return;
		}

		JSONObject resultJson = JSONObject.parseObject(unwrapJsonp(payload.toString()));
		JSONObject dataJson = resultJson == null ? null : resultJson.getJSONObject("data");
		JSONArray contentList = dataJson == null ? null : dataJson.getJSONArray("list");
		if (contentList == null || contentList.isEmpty()) {
			// Nothing to store, and an empty first page gives no page size to paginate with
			// (the page-count computation would otherwise divide by zero).
			System.out.println("【" + type + "\t第" + page + "页】无数据，跳过");
			return;
		}

		// The "yesterday" window does not depend on the item, so compute it once per page.
		long yesterdayBegin = DateUtils
				.dateTime(DateUtils.YYYY_MM_DD_HH_MM_SS, DateUtils.getYesterday() + " 00:00:00").getTime();
		long yesterdayEnd = DateUtils
				.dateTime(DateUtils.YYYY_MM_DD_HH_MM_SS, DateUtils.getYesterday() + " 23:59:59").getTime();

		// Each list item carries: id, title, focus_date ("yyyy-MM-dd HH:mm:ss"), url, image,
		// image2, image3, brief, ext_field, keywords, count.
		for (int i = 0; i < contentList.size(); i++) {
			JSONObject contentJson = contentList.getJSONObject(i);
			long publishMillis = DateUtils
					.dateTime(DateUtils.YYYY_MM_DD_HH_MM_SS, contentJson.getString("focus_date")).getTime();
			if (publishMillis < yesterdayBegin || publishMillis > yesterdayEnd) {
				continue;
			}
			saveArticle(contentJson, categoryId, publishMillis);
		}

		if (page == 1) {
			Integer total = dataJson.getInteger("total");
			if (total == null) {
				return;
			}
			// The first page's size is taken as the page size; contentList is non-empty here.
			int limit = contentList.size();
			int pages = total / limit;
			if (total % limit != 0) {
				pages++;
			}
			for (int next = 2; next <= pages; next++) {
				crawlerList(type, categoryId, next);
			}
		}
	}

	/**
	 * Downloads an article page and extracts the parts that are stored with the article.
	 * <p>
	 * Failures are logged and do not propagate; whatever could not be extracted stays
	 * {@code null} in the returned map.
	 * @param url article URL
	 * @return map with the keys {@code desc} (content of the {@code description} meta tag),
	 * {@code author} (text of the second child of the first {@code .zebian} element with the
	 * "编辑：" prefix removed) and {@code content} (inner HTML of {@code #content_area})
	 */
	public Map<String, String> crawlerContent(String url) {
		Map<String, String> resultMap = new HashMap<>(7);
		String desc = null;
		String content = null;
		String author = null;

		try {
			// NOTE(review): this replaces the JVM-wide default HTTPS socket factory on every call;
			// TrustAllCertManager presumably disables certificate validation — confirm this is intended.
			TrustManager[] trustManagers = new TrustManager[] { new TrustAllCertManager() };
			SSLContext sslContext = SSLContext.getInstance("SSL");
			sslContext.init(null, trustManagers, new java.security.SecureRandom());
			HttpsURLConnection.setDefaultSSLSocketFactory(sslContext.getSocketFactory());

			Document document = Jsoup.connect(url).get();
			Elements metas = document.select("meta[name=description]");
			if (!metas.isEmpty()) {
				desc = metas.get(0).attr("content");
			}
			Element contentArea = document.getElementById("content_area");
			if (ObjectUtil.isNotEmpty(contentArea)) {
				content = contentArea.html();
			}

			Elements zebians = document.getElementsByClass("zebian");
			// Check the child count instead of relying on an exception when the editor node is absent.
			if (!zebians.isEmpty() && zebians.get(0).children().size() > 1) {
				author = zebians.get(0).child(1).text().replace("编辑：", "");
			}
			System.out.println(author);
		}
		catch (Exception e) {
			// Best effort: keep whatever was extracted before the failure, but report the cause.
			System.out.println("爬取正文失败：" + url + "\t" + e);
		}
		resultMap.put("desc", desc);
		resultMap.put("author", author);
		resultMap.put("content", content);
		return resultMap;
	}

	/**
	 * Entry point: crawls the fixed set of CCTV categories below.
	 * @param args unused
	 */
	public static void main(String[] args) {
		CctvNewCrawler cctvNewCrawler = new CctvNewCrawler();
		List<JSONObject> jsons = new ArrayList<>();
		jsons.add(category("china", "国内", "720234357917097984"));
		jsons.add(category("world", "国际", "1007094406795747328"));
		jsons.add(category("economy_zixun", "经济", "1007094503747084288"));
		jsons.add(category("society", "社会", "1007094464723279872"));
		jsons.add(category("law", "法治", "1007094896220692480"));
		jsons.add(category("ent", "文娱", "1007094636672966656"));
		jsons.add(category("tech", "科技", "1008250015805071360"));
		jsons.add(category("life", "生活", "1007094926298046464"));
		jsons.add(category("edu", "教育", "1007094780541788160"));
		System.out.println(DateUtils.dateTime(DateUtils.YYYY_MM_DD_HH_MM_SS, DateUtils.getYesterday() + " 00:00:00"));
		cctvNewCrawler.crawlerList(jsons);
	}

	/** Builds one category descriptor in the shape {@link #crawlerList(List)} reads. */
	private static JSONObject category(String type, String name, String id) {
		JSONObject json = new JSONObject();
		json.put("type", type);
		json.put("name", name);
		json.put("id", id);
		return json;
	}

	/** Creates the JDBC helper with the connection settings declared at the top of the class. */
	private static JdbcUtil newJdbcUtil() {
		return new JdbcUtil(JDBC_DRIVER, JDBC_URL, JDBC_USER, JDBC_PASSWORD);
	}

	/**
	 * Strips a JSONP wrapper {@code callback( ... )} and returns the JSON inside it. Only the
	 * outermost parentheses are removed, so parentheses inside the JSON text are left alone;
	 * input that already starts with <code>{</code> or has no wrapper is returned trimmed.
	 */
	private static String unwrapJsonp(String jsonp) {
		String text = jsonp.trim();
		if (text.startsWith("{")) {
			return text;
		}
		int open = text.indexOf('(');
		int close = text.lastIndexOf(')');
		if (open < 0 || close <= open) {
			return text;
		}
		return text.substring(open + 1, close);
	}

	/** Fetches the article page for one list item and inserts it into {@code content_article}. */
	private void saveArticle(JSONObject contentJson, String categoryId, long publishMillis) {
		String title = contentJson.getString("title");
		String url = contentJson.getString("url");
		System.out.println("正在爬取：" + title + "\t" + contentJson.getString("focus_date"));

		Map<String, String> page = crawlerContent(url);

		// Order must match the column list in INSERT_ARTICLE_SQL.
		List<Object> params = new ArrayList<>();
		params.add(SnowflakeIdUtil.generateStrId());
		params.add(publishMillis);
		params.add(publishMillis);
		params.add(categoryId);
		params.add(contentJson.getString("image"));
		params.add(title);
		params.add(contentJson.getString("brief"));
		params.add(contentJson.getString("keywords"));
		// NOTE(review): status value 2 is taken over as-is; its meaning is defined by the table schema.
		params.add(Byte.valueOf("2"));
		params.add(publishMillis);
		params.add("央视新闻网");
		params.add(url);
		params.add(page.get("author"));
		params.add(page.get("desc"));
		params.add(page.get("content"));

		// The three-argument crawlerList is public and may be called without the list overload.
		if (jdbcUtil == null) {
			jdbcUtil = newJdbcUtil();
		}
		try {
			jdbcUtil.updateByParams(INSERT_ARTICLE_SQL, params);
		}
		catch (SQLException e) {
			// A single failed insert must not stop the crawl; report the item and the reason.
			System.out.println("报错了" + contentJson.toJSONString() + "\t" + e.getMessage());
		}
	}

}
